import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels as sm
import seaborn as sns
%matplotlib inline
# Seed NumPy's global random generator so the np.random.choice sub-sample
# drawn further down is the same on every run.
np.random.seed(seed=42)
# Load the trip records from the CSV file and preview the first five rows.
ford_data = pd.read_csv(filepath_or_buffer='201902-fordgobike-tripdata.csv')
ford_data.head(n=5)
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | end_station_longitude | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 52185 | 32:10.1 | 01:56.0 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 13.0 | Commercial St at Montgomery St | 37.794231 | -122.402923 | 4902 | Customer | 1984.0 | Male | No |
| 1 | 42521 | 53:21.8 | 42:03.1 | 23.0 | The Embarcadero at Steuart St | 37.791464 | -122.391034 | 81.0 | Berry St at 4th St | 37.775880 | -122.393170 | 2535 | Customer | NaN | NaN | No |
| 2 | 61854 | 13:13.2 | 24:08.1 | 86.0 | Market St at Dolores St | 37.769305 | -122.426826 | 3.0 | Powell St BART Station (Market St at 4th St) | 37.786375 | -122.404904 | 5905 | Customer | 1972.0 | Male | No |
| 3 | 36490 | 54:26.0 | 02:36.8 | 375.0 | Grove St at Masonic Ave | 37.774836 | -122.446546 | 70.0 | Central Ave at Fell St | 37.773311 | -122.444293 | 6638 | Subscriber | 1989.0 | Other | No |
| 4 | 1585 | 54:18.5 | 20:44.1 | 7.0 | Frank H Ogawa Plaza | 37.804562 | -122.271738 | 222.0 | 10th Ave at E 15th St | 37.792714 | -122.248780 | 4898 | Subscriber | 1974.0 | Male | Yes |
# Remove the four station latitude/longitude columns from the frame in place.
# `columns=` already selects the column axis, so no separate `axis` argument
# is passed.
ford_data.drop(
    columns=[
        'start_station_latitude',
        'start_station_longitude',
        'end_station_latitude',
        'end_station_longitude',
    ],
    inplace=True,
)
ford_data.head()
| duration_sec | start_time | end_time | start_station_id | start_station_name | end_station_id | end_station_name | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 52185 | 32:10.1 | 01:56.0 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 13.0 | Commercial St at Montgomery St | 4902 | Customer | 1984.0 | Male | No |
| 1 | 42521 | 53:21.8 | 42:03.1 | 23.0 | The Embarcadero at Steuart St | 81.0 | Berry St at 4th St | 2535 | Customer | NaN | NaN | No |
| 2 | 61854 | 13:13.2 | 24:08.1 | 86.0 | Market St at Dolores St | 3.0 | Powell St BART Station (Market St at 4th St) | 5905 | Customer | 1972.0 | Male | No |
| 3 | 36490 | 54:26.0 | 02:36.8 | 375.0 | Grove St at Masonic Ave | 70.0 | Central Ave at Fell St | 6638 | Subscriber | 1989.0 | Other | No |
| 4 | 1585 | 54:18.5 | 20:44.1 | 7.0 | Frank H Ogawa Plaza | 222.0 | 10th Ave at E 15th St | 4898 | Subscriber | 1974.0 | Male | Yes |
# Print each column's dtype and non-null count to see where values are missing.
ford_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 183412 entries, 0 to 183411 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 183412 non-null int64 1 start_time 183412 non-null object 2 end_time 183412 non-null object 3 start_station_id 183215 non-null float64 4 start_station_name 183215 non-null object 5 end_station_id 183215 non-null float64 6 end_station_name 183215 non-null object 7 bike_id 183412 non-null int64 8 user_type 183412 non-null object 9 member_birth_year 175147 non-null float64 10 member_gender 175147 non-null object 11 bike_share_for_all_trip 183412 non-null object dtypes: float64(3), int64(2), object(7) memory usage: 11.9+ MB
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
ford_data.describe()
| duration_sec | start_station_id | end_station_id | bike_id | member_birth_year | |
|---|---|---|---|---|---|
| count | 183412.000000 | 183215.000000 | 183215.000000 | 183412.000000 | 175147.000000 |
| mean | 726.078435 | 138.590427 | 136.249123 | 4472.906375 | 1984.806437 |
| std | 1794.389780 | 111.778864 | 111.515131 | 1664.383394 | 10.116689 |
| min | 61.000000 | 3.000000 | 3.000000 | 11.000000 | 1878.000000 |
| 25% | 325.000000 | 47.000000 | 44.000000 | 3777.000000 | 1980.000000 |
| 50% | 514.000000 | 104.000000 | 100.000000 | 4958.000000 | 1987.000000 |
| 75% | 796.000000 | 239.000000 | 235.000000 | 5502.000000 | 1992.000000 |
| max | 85444.000000 | 398.000000 | 398.000000 | 6645.000000 | 2001.000000 |
# Most frequent value(s) of member_gender; mode() returns a Series, not a scalar.
ford_data['member_gender'].mode()
0 Male dtype: object
# Bar chart of how many trips fall under each recorded gender.
sns.countplot(data=ford_data, x='member_gender');

# Fill missing genders with the most frequent category. mode() returns a
# Series (several values on a tie), so take its first entry as the fill value
# instead of hard-coding the category name.
gender_mode = ford_data['member_gender'].mode()[0]
# Assign the filled column back rather than calling fillna(inplace=True) on
# the column selection: that chained in-place form does not update the frame
# when pandas Copy-on-Write is active.
ford_data['member_gender'] = ford_data['member_gender'].fillna(gender_mode)

# Fill missing birth years with the mean birth year rounded to a whole year.
mean_birth_year = round(ford_data['member_birth_year'].mean())
ford_data['member_birth_year'] = ford_data['member_birth_year'].fillna(mean_birth_year)

# Re-check non-null counts after the imputation.
ford_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 183412 entries, 0 to 183411 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 183412 non-null int64 1 start_time 183412 non-null object 2 end_time 183412 non-null object 3 start_station_id 183215 non-null float64 4 start_station_name 183215 non-null object 5 end_station_id 183215 non-null float64 6 end_station_name 183215 non-null object 7 bike_id 183412 non-null int64 8 user_type 183412 non-null object 9 member_birth_year 183412 non-null float64 10 member_gender 183412 non-null object 11 bike_share_for_all_trip 183412 non-null object dtypes: float64(3), int64(2), object(7) memory usage: 11.9+ MB
# Show the first ten rows to eyeball the frame after the missing-value fills.
ford_data.head(10)
| duration_sec | start_time | end_time | start_station_id | start_station_name | end_station_id | end_station_name | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 52185 | 32:10.1 | 01:56.0 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 13.0 | Commercial St at Montgomery St | 4902 | Customer | 1984.0 | Male | No |
| 1 | 42521 | 53:21.8 | 42:03.1 | 23.0 | The Embarcadero at Steuart St | 81.0 | Berry St at 4th St | 2535 | Customer | 1985.0 | Male | No |
| 2 | 61854 | 13:13.2 | 24:08.1 | 86.0 | Market St at Dolores St | 3.0 | Powell St BART Station (Market St at 4th St) | 5905 | Customer | 1972.0 | Male | No |
| 3 | 36490 | 54:26.0 | 02:36.8 | 375.0 | Grove St at Masonic Ave | 70.0 | Central Ave at Fell St | 6638 | Subscriber | 1989.0 | Other | No |
| 4 | 1585 | 54:18.5 | 20:44.1 | 7.0 | Frank H Ogawa Plaza | 222.0 | 10th Ave at E 15th St | 4898 | Subscriber | 1974.0 | Male | Yes |
| 5 | 1793 | 49:58.6 | 19:51.8 | 93.0 | 4th St at Mission Bay Blvd S | 323.0 | Broadway at Kearny | 5200 | Subscriber | 1959.0 | Male | No |
| 6 | 1147 | 55:35.1 | 14:42.6 | 300.0 | Palm St at Willow St | 312.0 | San Jose Diridon Station | 3803 | Subscriber | 1983.0 | Female | No |
| 7 | 1615 | 41:06.8 | 08:02.8 | 10.0 | Washington St at Kearny St | 127.0 | Valencia St at 21st St | 6329 | Subscriber | 1989.0 | Male | No |
| 8 | 1570 | 41:48.8 | 07:59.7 | 10.0 | Washington St at Kearny St | 127.0 | Valencia St at 21st St | 6548 | Subscriber | 1988.0 | Other | No |
| 9 | 1049 | 49:47.7 | 07:17.0 | 19.0 | Post St at Kearny St | 121.0 | Mission Playground | 6488 | Subscriber | 1992.0 | Male | No |
# Discard, in place, every row that still holds at least one missing value,
# then print the column summary again to confirm no nulls remain.
ford_data.dropna(axis='index', inplace=True)
ford_data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 183215 entries, 0 to 183411 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 183215 non-null int64 1 start_time 183215 non-null object 2 end_time 183215 non-null object 3 start_station_id 183215 non-null float64 4 start_station_name 183215 non-null object 5 end_station_id 183215 non-null float64 6 end_station_name 183215 non-null object 7 bike_id 183215 non-null int64 8 user_type 183215 non-null object 9 member_birth_year 183215 non-null float64 10 member_gender 183215 non-null object 11 bike_share_for_all_trip 183215 non-null object dtypes: float64(3), int64(2), object(7) memory usage: 13.3+ MB
# Group trips by their duration and, for each duration, keep the start
# station name that sorts last alphabetically (max on strings). Groups come
# back ordered by duration, so tail(10) shows the ten longest durations.
ford_data.groupby('duration_sec')['start_station_name'].max().tail(10)
duration_sec 80891 Civic Center/UN Plaza BART Station (Market St ... 81549 Jersey St at Church St 82385 Fell St at Stanyan St 82512 Myrtle St at Polk St 83195 Bancroft Way at College Ave 83407 11th St at Natoma St 83519 Page St at Scott St 83772 Folsom St at 9th St 84548 Powell St BART Station (Market St at 4th St) 85444 Powell St BART Station (Market St at 5th St) Name: start_station_name, dtype: object
# Same view for end stations: per duration, the end station name that sorts
# last alphabetically, restricted to the ten longest durations.
ford_data.groupby('duration_sec')['end_station_name'].max().tail(10)
duration_sec 80891 Webster St at Grove St 81549 Cesar Chavez St at Dolores St 82385 Fell St at Stanyan St 82512 Civic Center/UN Plaza BART Station (Market St ... 83195 Telegraph Ave at Ashby Ave 83407 16th St Depot 83519 San Francisco Public Library (Grove St at Hyde... 83772 Hubbell St at 16th St 84548 Myrtle St at Polk St 85444 Valencia St at 16th St Name: end_station_name, dtype: object
# To keep the heavier plots fast, draw 20,000 distinct row positions at random
# (no replacement) and build a subset of the dataset from those rows.
sample = np.random.choice(len(ford_data), size=20000, replace=False)
ford_data_subset = ford_data.iloc[sample]
# Trip counts per gender (full dataset), then trip duration by gender on the
# random subset.
sns.countplot(x='member_gender', data=ford_data);
sns.catplot(x='member_gender', y='duration_sec', data=ford_data_subset);
# Trip counts per user type (full dataset), then trip duration by user type
# on the random subset.
sns.countplot(x='user_type', data=ford_data);
sns.catplot(x='user_type', y='duration_sec', data=ford_data_subset);
# Histogram of rider birth years using matplotlib's default binning, with a
# red vertical line at the mean birth year rounded to a whole year. The
# built-in round() is used instead of calling the __round__ dunder directly.
plt.hist(ford_data['member_birth_year']);
plt.axvline(round(ford_data['member_birth_year'].mean()), color='red')
<matplotlib.lines.Line2D at 0x13da1970>
# Ten-year bins starting at 1930. np.arange leaves out its stop value, so the
# stop is pushed one bin width past the latest birth year; without that, the
# last edge lands below the maximum and riders born in the latest year are
# silently dropped from the histogram. Birth years before 1930 lie below the
# first edge and are not drawn.
bin_width = 10
latest_birth_year = ford_data['member_birth_year'].max()
bins = np.arange(1930, latest_birth_year + bin_width, bin_width)

plt.xlabel('Years of Birth')
plt.ylabel('Count')
plt.hist(ford_data['member_birth_year'], bins=bins);
# Reference lines: rounded mean birth year (red) and latest birth year (black).
# Each line carries its own label so the legend cannot be mismatched by
# drawing order.
plt.axvline(round(ford_data['member_birth_year'].mean()), color='red', label='Mean')
plt.axvline(latest_birth_year, color='black', label='Max')
plt.legend();
plt.show()
# Scatter plot with a fitted regression line of birth year against trip
# duration, drawn from the random subset.
sns.regplot(x='duration_sec', y='member_birth_year', data=ford_data_subset);
# Overlay one scatter layer per gender (subset rows only), each with its own
# marker, and no fitted regression line. Every layer is given its legend label
# when it is drawn, so labels stay attached to the right series.
ttype_markers = [['Male', '^'], ['Female', 'p'], ['Other', 'o']]
for (ttype, marker), label in zip(ttype_markers, ['Males', 'Females', 'Other']):
    plot_data = ford_data_subset.loc[ford_data_subset['member_gender'] == ttype]
    sns.regplot(data=plot_data, x='duration_sec', y='member_birth_year',
                marker=marker, fit_reg=False, label=label)
plt.legend();